Author: Madalina-Alina Racovita, 1st year master's student on Computational Optimization at Faculty of Computer Science, UAIC, Iasi, Romania

!pip install feature_engine
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np
import researchpy as rp
import scipy.stats as stats
from statistics import median
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from collections import Counter
from sklearn import metrics
# Global notebook configuration: silence warnings, pick a plot style, and
# make pandas print frames without truncation.
warnings.filterwarnings('ignore')
matplotlib.style.use('ggplot')
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)     # show every row
pd.set_option('display.expand_frame_repr', False)
# 'max_colwidth' with -1 was deprecated in pandas 1.0 and removed in 2.0;
# the supported spelling is the 'display.'-prefixed option with None.
pd.set_option('display.max_colwidth', None)
The dataframes are going to be loaded in an unprocessed version since, for this task, the feature selection has to be carried out using empirical and encapsulated methods. As specified, the features UnitsInBuilding and Stories are going to be removed from the dataframe, and the regression task is going to be pursued only on the real estates geographically located in the state of Washington.
# Inspect the raw data directory, then load the unprocessed assessor
# extracts for both regions and both asset classes (RCON / RSFR).
os.listdir('./../Data')
df_rcon = pd.concat([pd.read_csv("./../Data/RCON_12011.assessor.tsv", sep="\t"),
                     pd.read_csv("./../Data/RCON_53033.assessor.tsv", sep="\t")])
df_rsfr = pd.concat([pd.read_csv("./../Data/RSFR_12011.assessor.tsv", sep="\t"),
                     pd.read_csv("./../Data/RSFR_53033.assessor.tsv", sep="\t")])
# Binary indicator distinguishing the two asset classes after concatenation.
df_rcon['isRCON'] = 1
df_rsfr['isRCON'] = 0
df = pd.concat([df_rcon, df_rsfr])
# Drop the two features excluded by the task statement and keep only the
# Washington-state records.
df = df.drop(columns=['UnitsInBuilding', 'Stories'])
df = df[df['State'] == 'WA']
The columns that have a single value in the entire dataframe are going to be dropped since they have no predictive value. Their names were detected during EDA.
# Single-valued columns (identified during EDA) carry no predictive signal.
columns_constant_features = ['AtticSqft', 'IsFixer', 'GarageNoOfCars', 'EffectiveYearBuilt', 'TotalRooms', 'State']
df = df.drop(columns=columns_constant_features)
The features that have a missing percentage greater than 95% are going to be dropped, since they have no predictive value and increase the model complexity for no reason.
# Columns with more than 95% missing values (detected during EDA) are dropped.
column_miss_perc_ge_95 = ['DeedType', 'RoofCode', 'BuildingShapeCode', 'City', 'StructureCode']
df = df.drop(columns=column_miss_perc_ge_95)
The missing values for the object type columns are going to be replaced with an empty string, the ones for numerical columns with 0, except from SellPrice which is going to be replaced with the mean corresponding to the real estates category from which is belonging: residential or RCON.
# Columns that still contain missing values, split below by dtype so that
# object columns get '' and numeric columns get 0 (SellPrice is special).
features_with_missing_values = ['BuildingCode', 'GarageCarportCode', 'PatioPorchCode', 'PoolCode', 'Zonning', \
'PropTaxAmount', 'FoundationCode', 'ExteriorCode', 'CoolingCode', 'HeatingCode', \
'HeatingSourceCode', 'View', 'DocType', 'TransType', 'DistressCode', 'SellPrice']
object_miss_values_features = []
numeric_miss_values_features = []
# Iterate (name, dtype) pairs directly instead of positional indexing into a
# parallel list, which was fragile.
for name, dtype in df[features_with_missing_values].dtypes.items():
    if dtype == object:
        object_miss_values_features.append(name)
    else:
        numeric_miss_values_features.append(name)
# Object columns: missing values become empty strings.
for feature in object_miss_values_features:
    df[feature] = df[feature].fillna('')
# Per-category mean sell prices, used below to impute missing/zero prices.
rcon_medium_sellprice = df_rcon['SellPrice'].mean()
rsfr_medium_sellprice = df_rsfr['SellPrice'].mean()
# Numeric columns (except SellPrice, which is imputed separately): NaN -> 0.
for feature in numeric_miss_values_features:
    if feature == 'SellPrice':
        continue
    df[feature] = df[feature].fillna(0)
# Impute SellPrice: fall back to LastSalePrice when missing, then replace a
# zero price with the mean of the record's category (RCON vs RSFR).
# Vectorized equivalent of the original row-by-row iterrows loop, which was
# O(n) Python-level work per row and carried an unused `added` flag and a
# redundant `elif` branch.
sell_price = df['SellPrice'].fillna(df['LastSalePrice'])
category_mean = np.where(df['isRCON'] == 1, rcon_medium_sellprice, rsfr_medium_sellprice)
df['SellPrice'] = sell_price.mask(sell_price == 0, category_mean)
# Replace zero LastSalePrice values with the matching category mean.
df.loc[(df['isRCON'] == 1) & (df['LastSalePrice'] == 0), 'LastSalePrice'] = rcon_medium_sellprice
df.loc[(df['isRCON'] == 0) & (df['LastSalePrice'] == 0), 'LastSalePrice'] = rsfr_medium_sellprice
# Convert every boolean column to 0/1. astype(int) maps False->0, True->1
# directly, replacing the element-wise lambda comparison against False.
bool_type_columns = df.select_dtypes(include=bool).columns.tolist()
for column in bool_type_columns:
    df[column] = df[column].astype(int)
df.head()
The real estates with a price bigger than 2.5 million US dollars are going to be dropped since they are outliers and we want a robust model, which will learn generalities not particularities.
# Drop outlier sales above 2.5M USD so the model learns generalities.
df = df.drop(index=df.index[df['SellPrice'] > 2500000])
Some features are going to be rebuilt for increasing their predictive power.
# Split SellDate (string, 'YYYY-MM-DD' layout) into numeric year/month/day
# using the vectorized .str accessor instead of a Python-level apply.
df['SellDate_Year'] = df['SellDate'].str[:4].astype(int)
df['SellDate_Month'] = df['SellDate'].str[5:7].astype(int)
df['SellDate_Day'] = df['SellDate'].str[8:].astype(int)
del df['SellDate']
# Keep only the date part of StatusDate (drop the time component).
df['StatusDate'] = df['StatusDate'].apply(lambda x: str(x.split()[0]))
# (A commented-out SellPricePerLivingSqft feature was dead code and removed.)
del df['LastSalePrice']
For the regression task, there are going to be kept only the records from 2009.
# Keep only the 2009 sales for the regression task.
df = df.loc[df['SellDate_Year'] == 2009]
df.shape
Before constructing a Machine Learning model it is important to filter the features, since the features uncorrelated with the target class, complicate useless the model and increase the computational complexity. It is desirable a model with as few features as possible, robust and with a great predictive power.
For instance, a small decrease of accuracy can be tolerated if the number of features used in the second model is a lot smaller.
The numerical features are going to be filtered based on the Pearson's correlation coefficient with the target class, SellPrice.
# Pearson correlation matrix of the numeric features.
corr = df.corr()
# Generate a mask for the upper triangle.
# np.bool was deprecated in NumPy 1.20 and removed in 1.24 — use the
# builtin bool dtype instead.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(30, 16))
# Reuse the already-computed `corr` instead of recomputing df.corr().
sns.heatmap(corr, annot=True, fmt=".2f", mask=mask, cmap='magma')
plt.show()
Are going to be kept the features with a correlation coefficient with the SellPrice variable greater than 0.08.
# Keep the numeric features whose absolute correlation with the target
# exceeds 0.08 (excluding the target itself).
relevant_numerical_features = [
    feature for feature in corr['SellPrice'].keys()
    if feature != 'SellPrice' and abs(corr['SellPrice'][feature]) > 0.08
]
relevant_numerical_features
str(len(corr['isRCON']) - len(relevant_numerical_features)) + ' numerical features were removed due to small correlation from a ' \
+ 'total of ' + str(len(corr['isRCON'])) + ' numerical features'
Let's take a glance on the features that were eliminated.
# Numerical features dropped by the correlation filter.
list(set(corr['SellPrice'].keys()) - set(relevant_numerical_features))
The categorical features are going to be filtered as well, but before that we have to collect their names.
# Collect the names of the object-dtype (string/categorical) columns.
# select_dtypes replaces the manual positional loop over df.dtypes and
# matches the pattern already used for the boolean columns above.
character_type_features = df.select_dtypes(include=object).columns.tolist()
len(character_type_features)
The categorical features are going to be filtered by pursuing some one-way ANOVA tests. The one-way analysis of variance (ANOVA) is used to determine whether there are any statistically significant differences between the means of three or more independent (unrelated) groups.
The one-way ANOVA compares the means between the groups you are interested in and determines whether any of those means are statistically significantly different from each other. Specifically, it tests the null hypothesis:
The H0 (Null Hypothesis): $\mu_1 = \mu_2 = \dots = \mu_k$, where k is the cardinal of the categorical feature's set of unique values.
If, however, the one-way ANOVA returns a statistically significant result (p-value as close as possible to 0, preferable smaller than 0.05), we accept the alternative hypothesis (HA), which is that there are at least two group means that are statistically significantly different from each other.
At this point, it is important to realize that the one-way ANOVA is an omnibus test statistic and cannot tell you which specific groups were statistically significantly different from each other, only that at least two groups were. Still, the ANOVA test will be used in order to specify if there is a relationship between SellPrice and a specific characteristic for that real estate like Quality.
def split_continuous_into_groups(categorical_feature):
    """Group the SellPrice observations by each unique value of the given
    categorical feature, as input for a one-way ANOVA test."""
    groups = []
    for value in df[categorical_feature].unique():
        group_prices = df[df[categorical_feature] == value]['SellPrice']
        groups.append(list(group_prices))
    return groups
stats.f_oneway(*split_continuous_into_groups('Quality'))
There are going to be kept the features with a p_value smaller than 0.05.
# Run a one-way ANOVA test of every categorical feature against SellPrice
# and keep only the features whose p-value is significant (<= 0.05).
len(character_type_features)
relevant_categorical_features = []
for nominal_feature in character_type_features:
    p_value = stats.f_oneway(*split_continuous_into_groups(nominal_feature))[1]
    if p_value > 0.05:
        continue
    relevant_categorical_features.append(nominal_feature)
    print(nominal_feature + ' & SellPrice one-way ANOVA p-value = ' + str(p_value))
'The features that were removed due to one-way ANOVA tests results were: ' + \
str(set(character_type_features).difference(set(relevant_categorical_features)))
relevant_categorical_features
# Final predictor list: filtered numeric + filtered categorical + target.
relevant_features = relevant_numerical_features + relevant_categorical_features + ['SellPrice']
len(relevant_features)
Therefore, after the filtration based on the Pearson's correlation coefficient and on the one-way ANOVA test values, 36 features were kept as predictors for the ML model.
The dataframe is going to be encoded so that the categorical features are transformed into numerical ones, by a One-Hot-encoding-like process. The numerical variables are going to be kept unmodified.
# One-hot encode the selected features; SellDate_Month is carried along for
# the later temporal split. The numeric target is re-attached untouched.
encoded_df = pd.get_dummies(df[relevant_features + ['SellDate_Month']])
encoded_df['SellPrice'] = df['SellPrice']
len(encoded_df.columns)
encoded_df.head(3)
Since the number of features from the encoded dataframe is quite big and this will increase the complexity of the computations for the ML models, I am going to filter as well these features, by removing the ones with a Pearson's correlation coefficient smaller than 0.08, analogically with the previous filtration step on the initial dataframe.
# Repeat the correlation-based filter on the one-hot-encoded frame.
corr = encoded_df.corr()
target_corr = corr['SellPrice']
relevant_features_from_encoded_df = [
    name for name in target_corr.keys()
    if name != 'SellPrice' and abs(target_corr[name]) > 0.08
]
len(relevant_features_from_encoded_df)
From a total of 492 variables, 67 are going to be kept for model building. The month in which the real estates are sold won't be taken into consideration since it is not correlated enough with the target variable. It will be included for now in the dataframes for the temporal analysis.
# Keep the filtered columns, the month (for splitting) and the target.
encoded_df = encoded_df.loc[:, relevant_features_from_encoded_df + ['SellDate_Month', 'SellPrice']]
The regression task mentions the fact that for the test dataframes there are going to be considered the observations from June and November 2009, from WA, while the training dataframes are going to be formed of the observations from a few months earlier. For instance, if the test dataframe is formed of the records from June 2009, the training dataframe is going to be constituted by the records from January until the end of May 2009.
Let's take a look on the histogram of the monthly sales from 2009.
# Monthly distribution of the 2009 sales.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the supported equivalent (the KDE shading
# option of the old call has no direct counterpart and is dropped).
sns.histplot(df['SellDate_Month'], kde=True, stat='density', color='darkgreen')
# Record counts for each split used below (train months vs test month).
encoded_df[encoded_df['SellDate_Month']<6]['SellDate_Month'].shape
encoded_df[encoded_df['SellDate_Month']==6]['SellDate_Month'].shape
encoded_df[(encoded_df['SellDate_Month']>7) & (encoded_df['SellDate_Month']<11)]['SellDate_Month'].shape
encoded_df[encoded_df['SellDate_Month']==11]['SellDate_Month'].shape
For maintaining approximately a proportion of 3/4 train and 1/4 test, there are going to be considered as a training set the records from January until the end of May (967 records) for predicting the sell prices for the houses sold in June 2009 (345 records). For predicting the sell prices for the properties sold in November 2009 (332 records), the training set will be formed of the records from August until October 2009 (1014 observations).
# Boolean masks for the two temporal splits, hoisted so each filter
# expression is written (and computed) once instead of being repeated
# inside the dict literal.
jan_to_may = encoded_df['SellDate_Month'] < 6
june = encoded_df['SellDate_Month'] == 6
aug_to_oct = (encoded_df['SellDate_Month'] > 7) & (encoded_df['SellDate_Month'] < 11)
november = encoded_df['SellDate_Month'] == 11
# Train on the months preceding each test month (Jan-May -> June,
# Aug-Oct -> November), keeping roughly a 3/4 - 1/4 train/test ratio.
train_test = {
    'for_June_prediction': {
        'X_train': encoded_df[jan_to_may][relevant_features_from_encoded_df],
        'y_train': encoded_df[jan_to_may]['SellPrice'],
        'X_test': encoded_df[june][relevant_features_from_encoded_df],
        'y_test': encoded_df[june]['SellPrice']
    },
    'for_November_prediction': {
        'X_train': encoded_df[aug_to_oct][relevant_features_from_encoded_df],
        'y_train': encoded_df[aug_to_oct]['SellPrice'],
        'X_test': encoded_df[november][relevant_features_from_encoded_df],
        'y_test': encoded_df[november]['SellPrice']
    }
}
Concerning the evaluation metrics for each regressions model will be reported the absolute and the relative error, the r squared and RMSE and RMSLE scores.
Absolute Error = |Actual Value - Measured Value|. Relative Error = Absolute Error / Known Value. (The relative error can be expressed as a percentage as well.)
The following implementation will compute the RMSE score that consists in the evaluation metric for the final model.
I am going to use as well the RMSLE score (Root Mean Squared Logarithmic Error) (by computing the logarithm over predicted and target values, errors occurred from estimating badly an expensive house will have the same impact on the model as well as errors occurred from bad cheap house estimations): $ \epsilon_{RMSLE} = \sqrt{\frac{1}{n} \sum\limits_{i=1}^{n}{(log(p_i + 1) - log(a_i + 1))^2}}$
def rmse(y_pred, y_test):
    """Root Mean Squared Error between predictions and targets.

    Only the first len(y_test) predictions are considered, matching the
    original loop over range(len(y_test)). Vectorized with NumPy instead of
    a Python-level list comprehension.
    """
    y_pred = np.asarray(y_pred, dtype=float)[:len(y_test)]
    y_test = np.asarray(y_test, dtype=float)
    return np.sqrt(np.mean((y_pred - y_test) ** 2))
def rmsle(y_pred, y_test):
    """Root Mean Squared Logarithmic Error.

    Negative predictions are clamped to 0 before taking logs (the log of a
    negative value is undefined). Unlike the original implementation, the
    caller's list is NOT mutated in place, and the leftover debug print of
    clamped values has been removed.
    """
    y_pred = np.asarray(y_pred, dtype=float)[:len(y_test)].copy()
    y_test = np.asarray(y_test, dtype=float)
    y_pred[y_pred < 0] = 0
    # log1p(x) == log(x + 1), numerically stable for small x.
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_test)) ** 2))
def mean_absolute_percentage_error(y_test, y_pred):
    """Mean absolute percentage error (in percent) of predictions vs targets."""
    relative_errors = [abs((actual - predicted) / actual)
                       for actual, predicted in zip(y_test, y_pred)]
    return np.mean(relative_errors) * 100
Given a data set of n statistical units, a linear regression model assumes that the relationship between the dependent variable y and the p-vector of regressors x is linear. This relationship is modeled through a disturbance term or error variable ε — an unobserved random variable that adds "noise" to the linear relationship between the dependent variable and regressors. Thus the model takes the form y = $ X \beta + \epsilon $
Multiple linear regression (MLR), also known simply as multivariate regression, is a statistical technique that uses more explanatory variables to predict the outcome of a response variable.
In essence, multiple regression is the extension of ordinary least-squares (OLS) regression that involves more than one explanatory variable.
The Formula for Multiple Linear Regression is: $ y_i = \beta_0 + \beta_1 x_{i_1} + \beta_2 x_{i_2} + \dots + \beta_p x_{i_p} + \epsilon $, for $ i \in \overline{1,n} $ where:
The Recursive Feature Elimination (RFE) method works by recursively removing attributes and building a model based on multivariate regression with those attributes that remain. It uses accuracy metric to rank the feature according to their importance. The RFE method takes the model to be used and the number of required features as input. It then gives the ranking of all the variables, 1 being most important. It also gives its support, True being relevant feature and False being irrelevant feature.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
def get_rfe_columns(which_prediction):
    """Run Recursive Feature Elimination with a linear-regression base model
    on the given split and return the names of the 30 selected columns.

    which_prediction: key into the global `train_test` dict
    ('for_June_prediction' or 'for_November_prediction').
    """
    model = LinearRegression()
    # Keep the 30 best-ranked features. The keyword argument is used because
    # passing n_features_to_select positionally is deprecated in recent
    # scikit-learn releases. (The original inline comment claimed "size 15"
    # while the code selected 30 — the code's 30 is kept.)
    rfe = RFE(model, n_features_to_select=30)
    # Reduce the training matrix to the selected features and refit.
    x_rfe = rfe.fit_transform(train_test[which_prediction]['X_train'], train_test[which_prediction]['y_train'])
    model.fit(x_rfe, train_test[which_prediction]['y_train'])
    print(rfe.support_)   # boolean mask over the candidate columns
    print(rfe.ranking_)   # 1 == selected; larger ranks were eliminated earlier
    # support_ is aligned with the training columns; keep the True ones.
    candidate_columns = list(train_test[which_prediction]['X_train'])
    return [name for name, keep in zip(candidate_columns, rfe.support_) if keep]
selected_columns_june = get_rfe_columns('for_June_prediction')
selected_columns_november = get_rfe_columns('for_November_prediction')
linear_regression_model = LinearRegression()
def get_results_given_model(model_june, model_november, selected_columns_june, selected_columns_november):
    """Fit one model per prediction month and evaluate it on that month's
    hold-out set.

    Returns:
        y_june:     [june_predictions, june_targets]
        y_november: [november_predictions, november_targets]
        r_sq:       [june_r_squared, november_r_squared]
    """
    def _fit_predict_score(model, split_key, columns):
        # Fit on the split's training data, then score and predict on its
        # test data (shared logic that was duplicated for each month).
        split = train_test[split_key]
        model.fit(split['X_train'][columns], split['y_train'])
        score = model.score(split['X_test'][columns], split['y_test'])
        predictions = list(model.predict(split['X_test'][columns]))
        return [predictions, list(split['y_test'])], score
    y_june, r_june = _fit_predict_score(model_june, 'for_June_prediction', selected_columns_june)
    y_november, r_november = _fit_predict_score(model_november, 'for_November_prediction', selected_columns_november)
    return y_june, y_november, [r_june, r_november]
def mean_relative_error(y_test, y_pred):
    """Average of |actual - predicted| / actual over all observations."""
    relative_errors = [abs(actual - predicted) / actual
                       for actual, predicted in zip(y_test, y_pred)]
    return np.mean(relative_errors)
def median_relative_error(y_test, y_pred):
    """Median of |actual - predicted| / actual over all observations."""
    relative_errors = [abs(actual - predicted) / actual
                       for actual, predicted in zip(y_test, y_pred)]
    return median(relative_errors)
from sklearn.metrics import mean_absolute_error
def get_evaluation_metrics(y_pred, y_test, r_sq):
    """Build the multi-line LaTeX summary of evaluation metrics that is
    displayed inside the plots' text boxes.

    The metrics are evaluated strictly in the order listed (rmsle is called
    before the absolute/relative error metrics, exactly as before).
    """
    metric_lines = [
        r'$\mathrm{Correlation}=%.5f$' % np.corrcoef(y_pred, y_test)[0, 1],
        r'$\mathrm{RMSE}=%.5f$' % rmse(y_pred, y_test),
        r'$\mathrm{RMSLE}=%.5f$' % rmsle(y_pred, y_test),
        r'$\mathrm{Mean\ absolute\ error}=%.5f$' % mean_absolute_error(y_test, y_pred),
        r'$\mathrm{Mean\ relative\ error}=%.5f$' % mean_relative_error(y_test, y_pred),
        r'$\mathrm{Median\ relative\ error}=%.5f$' % median_relative_error(y_test, y_pred),
        r'$\mathrm{R-squared}=%.5f$' % r_sq,
        r'$\mathrm{Median}=%.5f$' % median([(y_pred[i] - y_test[i]) for i in range(len(y_pred))]),
        r'$\mathrm{Mean}=%.5f$' % np.mean(list(np.array(y_pred) - np.array(y_test))),
    ]
    return '\n'.join(metric_lines)
def return_regression_results(y_june, y_november, cmap_index, model_naming, r_sq):
    # Scatter plots of predicted vs. target prices for the June and November
    # test sets, each with a fitted regression line and a metrics text box.
    #   y_june / y_november: [predictions, targets] pairs for each month
    #   cmap_index: two indices into `cmaps` selecting each panel's colormap
    #   model_naming: label appended to the panel titles
    #   r_sq: [june_r_squared, november_r_squared]
    cmaps = ['viridis', 'plasma', 'inferno', 'cubehelix']
    fig, ax = plt.subplots(1,2,figsize=(20,20))
    plt.subplot(2,2,1)
    plt.title('June sell prices predictions using ' + model_naming)
    # Points are colored by the target price.
    points = plt.scatter(list(y_june[0]), list(y_june[1]), c=y_june[1], s=20, cmap=cmaps[cmap_index[0]], marker="1")
    plt.colorbar(points)
    ax = sns.regplot(x=list(y_june[0]), y=list(y_june[1]), scatter=False, color=".1")
    plt.xlabel('predicted price', fontsize=12)
    plt.ylabel('target price', fontsize=12)
    textstr = get_evaluation_metrics(list(y_june[0]), list(y_june[1]), r_sq[0])
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=10,
            verticalalignment='top', bbox=props)
    # Second panel: November predictions.
    plt.subplot(2,2,2)
    plt.title('November sell prices predictions using ' + model_naming)
    points = plt.scatter(list(y_november[0]), list(y_november[1]), c=y_november[1], s=20, cmap=cmaps[cmap_index[1]], marker="1")
    plt.colorbar(points)
    ax = sns.regplot(x=list(y_november[0]), y=list(y_november[1]), scatter=False, color=".1")
    plt.xlabel('predicted price', fontsize=12)
    plt.ylabel('target price', fontsize=12)
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    textstr = get_evaluation_metrics(list(y_november[0]), list(y_november[1]), r_sq[1])
    ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=10,
            verticalalignment='top', bbox=props)
    plt.show()
from matplotlib import colors
def plot_histogram_of_predicted_real(y_june, y_november, model_naming, r_sq):
    # Side-by-side histograms of the prediction residuals (predicted - real)
    # for the June and November test sets, with bars colored by bin height,
    # a metrics text box, and dashed vertical lines marking the mean (red)
    # and median (blue) residual.
    fig, axs = plt.subplots(1, 2, figsize=(20, 10))
    N, bins, patches = axs[0].hist(np.array(y_june[0]) - np.array(y_june[1]), bins=50, alpha=0.6)
    axs[0].set_title('Histogram of predicted - real for June using ' + model_naming)
    # Color each bar by its relative height (plasma colormap for June).
    fracs = N / N.max()
    norm = colors.Normalize(fracs.min(), fracs.max())
    for thisfrac, thispatch in zip(fracs, patches):
        color = plt.cm.plasma(norm(thisfrac))
        thispatch.set_facecolor(color)
    textstr = get_evaluation_metrics(list(y_june[0]), list(y_june[1]), r_sq[0])
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    axs[0].text(0.05, 0.95, textstr, transform=axs[0].transAxes, fontsize=10,
                verticalalignment='top', bbox=props)
    axs[0].axvline(np.mean(list(np.array(y_june[0]) - np.array(y_june[1]))), color='red', linestyle='dashed', linewidth=1)
    axs[0].axvline(median(np.array(y_june[0]) - np.array(y_june[1])), color='blue', linestyle='dashed', linewidth=1)
    # Same plot for the November residuals (viridis colormap).
    N, bins, patches = axs[1].hist(np.array(y_november[0]) - np.array(y_november[1]),bins = 50, alpha=0.6)
    axs[1].set_title('Histogram of predicted - real for November using ' + model_naming)
    fracs = N / N.max()
    norm = colors.Normalize(fracs.min(), fracs.max())
    for thisfrac, thispatch in zip(fracs, patches):
        color = plt.cm.viridis(norm(thisfrac))
        thispatch.set_facecolor(color)
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    textstr = get_evaluation_metrics(list(y_november[0]), list(y_november[1]), r_sq[1])
    axs[1].text(0.05, 0.95, textstr, transform=axs[1].transAxes, fontsize=10,
                verticalalignment='top', bbox=props)
    axs[1].axvline(np.mean(list(np.array(y_november[0]) - np.array(y_november[1]))), color='red',
                   linestyle='dashed', linewidth=1)
    axs[1].axvline(median(np.array(y_november[0]) - np.array(y_november[1])), color='blue',
                   linestyle='dashed', linewidth=1)
    plt.show()
def build_intervals(list_disc):
    """Format consecutive bin edges as '[lo, \\nhi]' labels for plotting."""
    # Pair each edge with its successor instead of indexing by position.
    return ['[' + str(lo) + ', \n' + str(hi) + ']'
            for lo, hi in zip(list_disc, list_disc[1:])]
def build_dicretized_df_real_price_errors(discretiser, y_pred, y_test):
    """Discretise the real prices into 10 bins and attach the prediction
    errors (real - predicted) to each observation.

    `discretiser` is a feature_engine discretiser class (equal-width or
    equal-frequency). The returned frame has human-readable interval labels
    in its 'PriceRange' column.
    """
    frame = pd.DataFrame({'PriceRange': y_test,
                          'Error': np.array(y_test) - np.array(y_pred)})
    frame['PriceRange'] = frame['PriceRange'].apply(lambda price: round(price, 3))
    binner = discretiser(10, variables=['PriceRange'])
    binner.fit(frame)
    frame = binner.transform(frame)
    # Map each bin index to a '[lo, hi]' label built from the fitted edges.
    labels = build_intervals(binner.binner_dict_['PriceRange'])
    frame['PriceRange'] = frame['PriceRange'].apply(lambda bin_index: labels[bin_index])
    return frame
from feature_engine import discretisers as dsc
def show_boxplots_to_check_for_biases(discretiser, y_june, y_november, model_naming, r_sq):
    """Boxplots of prediction errors per discretised price range, to reveal
    systematic biases across the price spectrum, for both the June and the
    November test sets.

    `discretiser` is a feature_engine discretiser class: EqualWidthDiscretiser
    or EqualFrequencyDiscretiser.
    """
    if discretiser == dsc.EqualWidthDiscretiser:
        title = 'Errors over equal-length price intervals using '
    else:
        # Fixed typo in the displayed chart title (was 'equal-ferquency').
        title = 'Errors over equal-frequency price intervals using '
    fig, axs = plt.subplots(2, 1, figsize=(20, 16))
    discretized_df = build_dicretized_df_real_price_errors(discretiser, y_june[0], y_june[1])
    sns.boxplot(x="PriceRange", y="Error", data=discretized_df, palette="Set1", ax=axs[0])
    axs[0].set_title(title + model_naming + ' - June predictions')
    textstr = get_evaluation_metrics(list(y_june[0]), list(y_june[1]), r_sq[0])
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    axs[0].text(0.05, 0.95, textstr, transform=axs[0].transAxes, fontsize=10,
                verticalalignment='top', bbox=props)
    # Zero-error reference line.
    axs[0].axhline(0, color='red', linestyle='dashed', linewidth=1)
    discretized_df = build_dicretized_df_real_price_errors(discretiser, y_november[0], y_november[1])
    sns.boxplot(x="PriceRange", y="Error", data=discretized_df, palette="Set1", ax=axs[1])
    # NOTE: the original additionally drew a histogram of the raw November
    # errors on top of this boxplot; that overlay was inconsistent with the
    # June panel and looked like a copy-paste leftover, so it was removed.
    axs[1].set_title(title + model_naming + ' - November predictions')
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    textstr = get_evaluation_metrics(list(y_november[0]), list(y_november[1]), r_sq[1])
    axs[1].text(0.05, 0.95, textstr, transform=axs[1].transAxes, fontsize=10,
                verticalalignment='top', bbox=props)
    axs[1].axhline(0, color='red', linestyle='dashed', linewidth=1)
    plt.show()
# Baseline: multivariate linear regression on ALL filtered encoded features
# (no RFE). The same model object is reused for both months; it is re-fit
# per month inside get_results_given_model.
y_june, y_november, r_sq = get_results_given_model(linear_regression_model, linear_regression_model,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
# Quick look at the discretised price/error frame used by the boxplots.
build_dicretized_df_real_price_errors(dsc.EqualWidthDiscretiser, y_june[0], y_june[1]).head()
return_regression_results(y_june, y_november, [2, 3], "\n multivariate regression without RFE", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n multivariate regression without RFE", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n multivariate regression without RFE", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n multivariate regression without RFE", r_sq)
# Same evaluation restricted to the 30 RFE-selected columns per month.
y_june, y_november, r_sq = get_results_given_model(linear_regression_model, linear_regression_model,
                                                   selected_columns_june, selected_columns_november)
return_regression_results(y_june, y_november, [0, 1], "\n multivariate regression after RFE (30 features only)", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n multivariate regression after RFE (30 features only)", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n multivariate regression after RFE (30 features only)", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n multivariate regression after RFE (30 features only)", r_sq)
Lasso, or Least Absolute Shrinkage and Selection Operator, adds a penalty for non-zero coefficients, but unlike ridge regression, which penalizes the sum of squared coefficients (the so-called L2 penalty), lasso penalizes the sum of their absolute values (L1 penalty). As a result, for high values of $ \lambda $, many coefficients are exactly zeroed under lasso (this is why Lasso performs a sort of feature selection as well).
Under lasso, the loss function is defined as: $ L_{lasso}(\widehat{\beta}) = \sum_{i=1}^{n}(y_i - x_i' \widehat{\beta})^2 + \lambda \sum_{j=1}^{m} |\widehat{\beta}_j | $.
# Lasso regression (L1 penalty) on all filtered encoded features;
# alpha=0.1 controls the strength of the coefficient shrinkage.
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=0.1)
y_june, y_november, r_sq = get_results_given_model(lasso_model, lasso_model,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], "\n Lasso regression (L1 penalty for feature selection)", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n Lasso regression (L1 penalty for feature selection)", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n Lasso regression (L1 penalty for feature selection)", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n Lasso regression (L1 penalty for feature selection)", r_sq)
# k-NN regression: sweep k from 1 to 29 and record the RMSLE on each
# month's test set, to pick the best k per month.
from sklearn.neighbors import KNeighborsRegressor
rmsles_june = []
rmsles_november = []
for i in range(1, 30):
    knn_model = KNeighborsRegressor(n_neighbors=i)
    y_june, y_november, r_sq = get_results_given_model(knn_model, knn_model,
                                                       relevant_features_from_encoded_df,
                                                       relevant_features_from_encoded_df)
    rmsles_june.append(rmsle(y_june[0], y_june[1]))
    rmsles_november.append(rmsle(y_november[0], y_november[1]))
# Elbow plots: RMSLE as a function of k for each month.
fig, ax = plt.subplots(1,2,figsize=(20,10))
plt.subplot(2,2,1)
plt.plot(range(1,30), rmsles_june, color='blue',
         linestyle='dashed', marker='o',
         markerfacecolor='purple', markersize=4)
plt.title('RMSLE vs. K Value for June predictions')
plt.xlabel('K')
plt.ylabel('RMSLE')
plt.subplot(2,2,2)
plt.plot(range(1,30), rmsles_november, color='orange',
         linestyle='dashed', marker='o',
         markerfacecolor='green', markersize=4)
plt.title('RMSLE vs. K Value for November predictions')
plt.xlabel('K')
plt.ylabel('RMSLE')
plt.show()
# argmin is 0-based while the k sweep starts at 1, hence the +1.
best_k_june = np.argmin(rmsles_june) + 1
best_k_june
best_k_november = np.argmin(rmsles_november) + 1
best_k_november
# Final k-NN models with the per-month tuned k, evaluated like the others.
knn_model_june = KNeighborsRegressor(n_neighbors=best_k_june)
knn_model_november = KNeighborsRegressor(n_neighbors=best_k_november)
y_june, y_november, r_sq = get_results_given_model(knn_model_june, knn_model_november,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], "\n tunned_k-NN regression", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n tunned_k-NN regression", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n tunned_k-NN regression", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n tunned_k-NN regression", r_sq)
# Decision-tree regression: first with an unconstrained depth, then with
# per-month max_depth limits.
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=0)
y_june, y_november, r_sq = get_results_given_model(dt_model, dt_model,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], "\n decission trees without constraints on max_depth", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n decission trees without constraints on max_depth", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n decission trees without constraints on max_depth", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n decission trees without constraints on max_depth", r_sq)
# Depth-limited variants (separate depth per prediction month).
dt_model_june = DecisionTreeRegressor(max_depth=10, random_state=0)
dt_model_november = DecisionTreeRegressor(max_depth=6, random_state=0)
y_june, y_november, r_sq = get_results_given_model(dt_model_june, dt_model_november,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], "\n decission trees with constraints on max_depth", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n decission trees with constraints on max_depth", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n decission trees with constraints on max_depth", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n decission trees with constraints on max_depth", r_sq)
# MLP regressor, first with the L-BFGS solver and one hidden layer of 100
# units.
from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(solver='lbfgs', alpha=1e-4,
                         hidden_layer_sizes=(100,),
                         random_state=1)
y_june, y_november, r_sq = get_results_given_model(mlp_model, mlp_model,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], "\n simple MLP", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n simple MLP", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n simple MLP", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n simple MLP", r_sq)
# Second MLP: adam solver, ReLU activation, early stopping.
# NOTE(review): the label says 'a more complex architecture', but
# hidden_layer_sizes is left at its default, the same size as above —
# confirm whether a larger architecture was intended.
mlp_model = MLPRegressor(solver='adam', activation='relu',
                         alpha=1e-4, #regularization parameter for L2
                         early_stopping=True,
                         random_state=1)
y_june, y_november, r_sq = get_results_given_model(mlp_model, mlp_model,
                                                   relevant_features_from_encoded_df,
                                                   relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], "\n MLP with ADAM, ReLu and a more complex architecture", r_sq)
plot_histogram_of_predicted_real(y_june, y_november, "\n MLP with ADAM, ReLu and a more complex architecture", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualWidthDiscretiser, y_june, y_november,
                                  "\n MLP with ADAM, ReLu and a more complex architecture", r_sq)
show_boxplots_to_check_for_biases(dsc.EqualFrequencyDiscretiser, y_june, y_november,
                                  "\n MLP with ADAM, ReLu and a more complex architecture", r_sq)
from sklearn.svm import SVR

# Support-vector regression with the (default) RBF kernel.
rbf_label = "\n SVR with rbf kernel"
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.2)
y_june, y_november, r_sq = get_results_given_model(
    svr_model, svr_model,
    relevant_features_from_encoded_df, relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], rbf_label, r_sq)
plot_histogram_of_predicted_real(y_june, y_november, rbf_label, r_sq)
for discretiser in (dsc.EqualWidthDiscretiser, dsc.EqualFrequencyDiscretiser):
    show_boxplots_to_check_for_biases(discretiser, y_june, y_november, rbf_label, r_sq)
# Compare SVR kernels by RMSLE on each prediction month; keep the best per month.
kernel_candidates = ['poly', 'sigmoid', 'rbf']
rmsles_june = []
rmsles_november = []
for kernel in kernel_candidates:
    svr_model = SVR(kernel=kernel, C=1.0, epsilon=0.2)
    y_june, y_november, r_sq = get_results_given_model(
        svr_model, svr_model,
        relevant_features_from_encoded_df, relevant_features_from_encoded_df)
    rmsles_june.append(rmsle(y_june[0], y_june[1]))
    rmsles_november.append(rmsle(y_november[0], y_november[1]))
best_kernel_june = kernel_candidates[np.argmin(rmsles_june)]
best_kernel_june
best_kernel_november = kernel_candidates[np.argmin(rmsles_november)]
best_kernel_november
# Full report for the polynomial-kernel SVR (the kernel selected above).
poly_label = "\n SVR with polynomial kernel"
svr_model = SVR(kernel='poly', C=1.0, epsilon=0.2)
y_june, y_november, r_sq = get_results_given_model(
    svr_model, svr_model,
    relevant_features_from_encoded_df, relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], poly_label, r_sq)
plot_histogram_of_predicted_real(y_june, y_november, poly_label, r_sq)
for discretiser in (dsc.EqualWidthDiscretiser, dsc.EqualFrequencyDiscretiser):
    show_boxplots_to_check_for_biases(discretiser, y_june, y_november, poly_label, r_sq)
from sklearn.ensemble import RandomForestRegressor

# Random forest capped at depth 3 as a first ensemble baseline.
rf_label = "\n Random Forest max_depth=3"
rf_model = RandomForestRegressor(max_depth=3, random_state=0)
y_june, y_november, r_sq = get_results_given_model(
    rf_model, rf_model,
    relevant_features_from_encoded_df, relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], rf_label, r_sq)
plot_histogram_of_predicted_real(y_june, y_november, rf_label, r_sq)
for discretiser in (dsc.EqualWidthDiscretiser, dsc.EqualFrequencyDiscretiser):
    show_boxplots_to_check_for_biases(discretiser, y_june, y_november, rf_label, r_sq)
import tqdm
def get_rmsles_rf_regressor_given_n(n_estimators, max_depth=None):
    """Fit a RandomForestRegressor and score it with RMSLE on both months.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.
    max_depth : int or None, optional
        Maximum tree depth; None (the default) leaves the trees unpruned.

    Returns
    -------
    list
        [rmsle_june, rmsle_november] for the model evaluated on the
        globally shared `relevant_features_from_encoded_df` features.
    """
    # BUG FIX: `max_depth` was accepted but never forwarded to the regressor,
    # so callers passing it silently got unpruned trees. Forward it; the
    # default None preserves the previous behavior for existing callers.
    rf_model = RandomForestRegressor(n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     random_state=42)
    y_june, y_november, r_sq = get_results_given_model(
        rf_model, rf_model,
        relevant_features_from_encoded_df, relevant_features_from_encoded_df)
    return [rmsle(y_june[0], y_june[1]), rmsle(y_november[0], y_november[1])]
# Sweep the number of trees from 1 to 50 and record the per-month RMSLE.
rmsles_june = []
rmsles_november = []
for n_trees in tqdm.tqdm(range(1, 51)):
    june_err, november_err = get_rmsles_rf_regressor_given_n(n_trees)
    rmsles_june.append(june_err)
    rmsles_november.append(november_err)
# argmin is 0-based while the sweep started at 1 tree, hence the +1.
best_n_june = np.argmin(rmsles_june) + 1
best_n_november = np.argmin(rmsles_november) + 1
print(best_n_june, best_n_november)
# Refit one forest per target month with its tuned tree count, depth capped at 5.
tuned_label = "\n Random Forest tunned n, max_depth=5"  # (sic) kept byte-identical
rf_model_june = RandomForestRegressor(n_estimators=best_n_june,
                                      max_depth=5, random_state=0)
rf_model_november = RandomForestRegressor(n_estimators=best_n_november,
                                          max_depth=5, random_state=0)
y_june, y_november, r_sq = get_results_given_model(
    rf_model_june, rf_model_november,
    relevant_features_from_encoded_df, relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], tuned_label, r_sq)
plot_histogram_of_predicted_real(y_june, y_november, tuned_label, r_sq)
for discretiser in (dsc.EqualWidthDiscretiser, dsc.EqualFrequencyDiscretiser):
    show_boxplots_to_check_for_biases(discretiser, y_june, y_november, tuned_label, r_sq)
from sklearn.ensemble import ExtraTreesRegressor

# Extremely randomised trees at the same depth cap for comparison with the RF.
et_label = "\n ExtraTress max_depth=5"  # (sic) kept byte-identical
et_model = ExtraTreesRegressor(random_state=0, max_depth=5)
y_june, y_november, r_sq = get_results_given_model(
    et_model, et_model,
    relevant_features_from_encoded_df, relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], et_label, r_sq)
plot_histogram_of_predicted_real(y_june, y_november, et_label, r_sq)
for discretiser in (dsc.EqualWidthDiscretiser, dsc.EqualFrequencyDiscretiser):
    show_boxplots_to_check_for_biases(discretiser, y_june, y_november, et_label, r_sq)
from xgboost import XGBRegressor

# Gradient-boosted trees with default hyper-parameters.
xgb_label = "\n XGBoost"
xgbreg_model = XGBRegressor()
y_june, y_november, r_sq = get_results_given_model(
    xgbreg_model, xgbreg_model,
    relevant_features_from_encoded_df, relevant_features_from_encoded_df)
return_regression_results(y_june, y_november, [2, 3], xgb_label, r_sq)
plot_histogram_of_predicted_real(y_june, y_november, xgb_label, r_sq)
for discretiser in (dsc.EqualWidthDiscretiser, dsc.EqualFrequencyDiscretiser):
    show_boxplots_to_check_for_biases(discretiser, y_june, y_november, xgb_label, r_sq)
# Leaderboard of the June predictions, sorted best (lowest RMSLE) first.
# Model names are reproduced exactly as used throughout the experiments.
pd.DataFrame(
    [('Linear Regression without RFE', 0.36563),
     ('Linear Regression with RFE (30 features only)', 0.44655),
     ('Lasso regression', 0.36611),
     ('29-NN', 0.35977),
     ('Decission Tree Classifier (without prunning)', 0.39668),
     ('Decission Tree Classifier max_depth=10', 0.49562),
     ('MLP Adam + ReLu', 0.40380),
     ('SVR poly kernel', 0.55164),
     ('Random Forest', 0.33366),
     ('Extratrees', 0.34286),
     ('XGBoost', 0.33258)],
    columns=['Regressor', 'RMSLE_June_pred'],
).sort_values(by=['RMSLE_June_pred'], ascending=True)
# Leaderboard of the November predictions, sorted best (lowest RMSLE) first.
# Model names are reproduced exactly as used throughout the experiments.
pd.DataFrame(
    [('Linear Regression without RFE', 0.2876),
     ('Linear Regression with RFE (30 features only)', 0.33572),
     ('Lasso regression', 0.29091),
     ('25-NN', 0.22076),
     ('Decission Tree Classifier (without prunning)', 0.42064),
     ('Decission Tree Classifier max_depth=6', 0.25956),
     ('MLP Adam + ReLu', 0.30435),
     ('SVR poly kernel', 0.52066),
     ('Random Forest', 0.21840),
     ('Extratrees', 0.25824),
     ('XGBoost', 0.22247)],
    columns=['Regressor', 'RMSLE_November_pred'],
).sort_values(by=['RMSLE_November_pred'], ascending=True)